import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Topic-modelling script: fit an LDA model on the 'content' column of the
# review CSV and print the ten highest-weighted terms of every topic.

# Load the user review data.
user_data = pd.read_csv('warehouse_evaluate.csv')

# Vectorize the text into a bag-of-words count matrix. Missing values are
# dropped and the remaining cells coerced to str first, because
# CountVectorizer raises on NaN documents.
documents = user_data['content'].dropna().astype(str)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# LDA model; random_state is fixed so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components=5, random_state=42)  # assume 5 topics
lda.fit(X)

# Resolve the vocabulary once instead of rebuilding it for every word.
# get_feature_names() was removed in scikit-learn 1.2; the fallback keeps the
# script working on releases older than 1.0 that lack get_feature_names_out().
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# Show the keywords of each topic.
for index, topic in enumerate(lda.components_):
    print(f'Topic {index}:')
    # argsort() is ascending, so the last ten indices are the heaviest terms,
    # listed from the weakest to the strongest.
    print([feature_names[i] for i in topic.argsort()[-10:]])